import torch
import matplotlib.pyplot as plt
from IPython.display import Audio
#TODO: changeme
%cd /Users/janne/git/tutorial/codes
# from codes.data_loader import GTZANLoader
from utils import plot_spectrogram
/Users/janne/git/tutorial/codes
Audio Data Augmentations¶
In this chapter, we will discuss common transformations that we can apply to audio signals in the time domain. We will refer to these as “audio data augmentations”.
Data augmentations are a set of methods that add modified copies to a dataset, from the existing data. This process creates many variations of natural data, and can act as a regulariser to reduce the problem of overfitting. It can also help deep neural networks become robust to complex variations of natural data, which improves their generalisation performance.
In the field of computer vision, the transformations that we apply to images are often very self-explanatory. Take this image, for example. It becomes fairly obvious that we have applied various amounts of gaussian blurring on this image.

Naturally, we cannot translate transformations from the vision domain directly to the audio domain. Before we explore a battery of audio data augmentations, we now list the currently available code libraries:
Code Libraries¶
Name |
Author |
Framework |
Language |
License |
Link |
|---|---|---|---|---|---|
Muda |
B. McFee et al. (2015) |
General Purpose |
Python |
ISC License |
|
Audio Degradation Toolbox |
M. Mauch et al. (2013) |
General Purpose |
MATLAB |
GNU General Public License 2.0 |
|
rubberband |
- |
General Purpose |
C++ |
GNU General Public License (non-commercial) |
|
audiomentations |
I. Jordal (2021) |
General Purpose |
Python |
MIT License |
|
tensorflow-io |
tensorflow.org |
TensorFlow |
Python |
Apache 2.0 License |
|
torchaudio |
pytorch.org |
PyTorch |
Python |
BSD 2-Clause “Simplified” License |
|
torch-audiomentations |
Asteroid (2021) |
PyTorch |
Python |
MIT License |
|
torchaudio-augmentations |
J. Spijkervet (2021) |
PyTorch |
Python |
MIT License |
Listening¶
One of the most essential, and yet overlooked, parts of music research is exploring and observing the data. This also applies to data augmentation research: one has to develop a general understanding of the effect of transformations that can be applied to audio. Even more so, when transformations are applied sequentially.
For instance, we will understand why a reverb applied before a frequency filter will sound different than when the reverb is applied after the frequency filter. Before we develop this intuition, let’s listen to a series of audio data augmentations.
from torchaudio.datasets import GTZAN

# Download the GTZAN genre-classification dataset into the working directory.
dataset = GTZAN(root=".", download=True)

idx = 5
# BUG FIX: the f-string contained a stray literal "f" ("f{len(dataset)}"),
# which printed "f1000" instead of "1000".
print(f"Number of datapoints in the GTZAN dataset: {len(dataset)}\n")
print(f"Selected track no.: {idx}")

# Each datapoint is a (waveform, sample_rate, genre_label) tuple.
audio, sr, genre = dataset[idx]
print(f"Genre: {genre}\nSample rate: {sr}\nChannels: {audio.shape[0]}\nSamples: {audio.shape[1]}")
display(Audio(audio, rate=sr))
Number of datapoints in the GTZAN dataset: 1000
Selected track no.: 5
Genre: blues
Sample rate: 22050
Channels: 1
Samples: 661794
Random Crop¶
Similar to how we can crop an image, so that only a subset of the image is represented, we can ‘crop’ a piece of audio by selecting a fragment between two time points $t_0$ and $t_1$.
Various terms for this exist, e.g., slicing and trimming.
Frequency Filter¶
Note
In these examples and the accompanying code, we assume the shape of the audio in our array is ordered as follows: (channel, time)
from torch_audiomentations import LowPassFilter

# Apply a low-pass filter with an effectively fixed cutoff (~3 kHz;
# p=1.0 means the transform is always applied).
# torch-audiomentations operates on batched tensors, so a batch
# dimension is added before the call and removed afterwards.
lowpass = LowPassFilter(
    sample_rate=sr,
    p=1.0,
    min_cutoff_freq=3000,
    max_cutoff_freq=3001,
)
taudio = lowpass(audio.unsqueeze(0)).squeeze(0)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr)

print("LowPassFilter")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr)
Original
LowPassFilter
Delay¶
from torchaudio_augmentations import Delay

# Delay the signal by ~200 ms (the one-unit min/max range effectively
# fixes the delay time).
delay_transform = Delay(sample_rate=sr, min_delay=200, max_delay=201)
taudio = delay_transform(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr)

print(f"Delay of {200}ms")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr)
Original
Delay of 200ms
Comb filter¶
When we mix a delayed copy of the signal with the original using a short delay time, it will cause interference.
from torchaudio_augmentations import Delay

# A very short delay (~60 ms) mixed with the original produces
# comb-filter interference patterns in the spectrum.
comb = Delay(sample_rate=sr, min_delay=60, max_delay=61)
taudio = comb(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr)

print(f"Delay of {61}ms")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr)
Original
Delay of 61ms
Pitch Shift¶
from torchaudio_augmentations import PitchShift

# Shift the pitch of the full clip upward.
# NOTE(review): the parameters are named "cents" but the printout says
# "semitones" — verify the library's unit convention.
shift = PitchShift(
    sample_rate=sr,
    n_samples=audio.shape[1],
    pitch_cents_min=4,
    pitch_cents_max=5,
)
taudio = shift(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print(f"Pitch shift of {4} semitones")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr, title="Pitch shift")
Original
Pitch shift of 4 semitones
Reverb¶
from torchaudio_augmentations import Reverb

# Heavy, near-deterministic reverb: both reverberance and room size are
# pinned to ~90 by one-unit min/max ranges.
reverb = Reverb(
    sample_rate=sr,
    reverberance_min=90,
    reverberance_max=91,
    room_size_min=90,
    room_size_max=91,
)
taudio = reverb(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print("Reverb")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr, title="Reverb")
Original
Reverb
Gain¶
Warning
In Jupyter notebook’s Audio() object, we have to set normalize=False so that we can hear an unnormalized version of the audio. This is important to reflect the true audio transformation output.
from torchaudio_augmentations import Gain

# Attenuate the signal by roughly -16 dB.
gain = Gain(min_gain=-16, max_gain=-15)
taudio = gain(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print("Gain")
# normalize=False so the playback reflects the true (quieter) amplitude.
display(Audio(taudio, rate=sr, normalize=False))
plot_spectrogram(taudio, sr, title="Gain")
Original
Gain
Noise¶
from torchaudio_augmentations import Noise

# Add noise at a fixed signal-to-noise ratio of 0.04.
noiser = Noise(min_snr=0.04, max_snr=0.04)
taudio = noiser(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print("Noise")
display(Audio(taudio, rate=sr, normalize=True))
plot_spectrogram(taudio, sr, title="Noise")
Original
Noise
Polarity Inversion¶
import math

# BUG FIX: PolarityInversion was used here before being imported — the
# import only appeared in the following cell, so a fresh top-to-bottom
# run raised NameError at this point.
from torchaudio_augmentations import PolarityInversion

# One period of a 440 Hz sine wave, with a leading channel dimension.
period = 1 / 440.0
test_audio = torch.sin(math.tau * 440.0 * torch.linspace(0, period, int(period * sr))).unsqueeze(0)

plt.plot(test_audio.squeeze(0))
plt.grid()
plt.xticks([])
plt.show()

# Polarity inversion flips the sign of every sample (a vertical mirror
# of the waveform).
inverted_test_audio = PolarityInversion()(test_audio)
plt.plot(inverted_test_audio.squeeze(0))
plt.grid()
plt.xticks([])
plt.show()
from torchaudio_augmentations import PolarityInversion

# Invert the polarity of the full track and compare against the original.
invert = PolarityInversion()
taudio = invert(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print("Polarity Inversion")
display(Audio(taudio, rate=sr, normalize=True))
plot_spectrogram(taudio, sr, title="Polarity Inversion")

# Summing a signal with its inverted copy cancels it out.
cancelled = audio + taudio
print("Original + Polarity Inversion")
display(Audio(cancelled, rate=sr, normalize=True))
plot_spectrogram(cancelled, sr, title="Original + Polarity Inversion")
Original
Polarity Inversion
Original + Polarity Inversion
Sequential Audio Data Augmentations¶
Now that we have built up some intuition of some of the audio transformations, let us observe how they can be applied sequentially. More importantly, to develop an understanding on how different audio transformations interact when we apply them before, or after each other.
For this, we can use a Compose module, which takes as input a list of audio transformations. These will be applied in the order they appear in the supplied list. This interface is similar to torchvision.transforms and torchaudio.transforms’ Compose modules.
from torchaudio_augmentations import Compose, HighLowPass

# Chain two transforms; Compose applies them in list order.
transform = Compose(
    [
        Delay(sample_rate=sr),
        HighLowPass(sample_rate=sr),
    ]
)
transformed_audio = transform(audio)

print("Original:")
display(Audio(audio, rate=sr))
print("Transform:", transform)
display(Audio(transformed_audio, rate=sr))
Original:
Transform: Compose(
Delay()
HighLowPass()
)
Now that we have listened to what a sequential audio transformation sounds like, let’s observe how two different transforms interact when they are applied in a different sequential order.
Let’s take the following two transforms:
Noise and Reverb
A signal that does not have any reverberation added, is commonly called a dry signal. A signal that is reverberated is called a wet signal.
When we first apply the Noise transform, the Reverb transform will apply the reverberation to the dry signal and the added noise signal. This will result in a completely wet signal.
Conversely, when we first apply the Reverb transform, the Noise signal will be added after the reverberated signal. The noise is thus dry, i.e., it is not reverberated.
from torchaudio_augmentations import Compose

noise = Noise(min_snr=0.05, max_snr=0.06)
# "dumping_factor" is the parameter name used by the library.
reverb = Reverb(
    sample_rate=sr,
    reverberance_min=80,
    reverberance_max=81,
    dumping_factor_min=0,
    dumping_factor_max=1,
    room_size_min=80,
    room_size_max=81,
)

# Noise -> Reverb reverberates the added noise too (fully "wet");
# Reverb -> Noise leaves the added noise "dry".
transform1 = Compose([noise, reverb])
transform2 = Compose([reverb, noise])

print("Transform 1:", transform1)
taudio1 = transform1(audio)
taudio2 = transform2(audio)
display(Audio(taudio1, rate=sr))
plot_spectrogram(taudio1, sr, title="Transform 1")

# BUG FIX: this previously printed just "Transform:", inconsistent with
# the "Transform 1:" label above.
print("Transform 2:", transform2)
display(Audio(taudio2, rate=sr))
plot_spectrogram(taudio2, sr, title="Transform 2")
Transform 1: Compose(
Noise()
Reverb()
)
Transform: Compose(
Reverb()
Noise()
)
More Sequential Audio Data Augmentations¶
Let’s continue to develop our intuition for sequential audio transformations a bit more in the following examples:
# Take a random 4-second crop, band-limit it, then add a delay.
num_samples = sr * 4  # 4 seconds of audio

crop = RandomResizedCrop(n_samples=num_samples)
band_filter = HighLowPass(
    sample_rate=sr,
    lowpass_freq_low=2200,
    lowpass_freq_high=4000,
    highpass_freq_low=200,
    highpass_freq_high=1200,
)
echo = Delay(
    sample_rate=sr,
    volume_factor=0.5,
    min_delay=100,
    max_delay=500,
    delay_interval=1,
)

transforms = [crop, band_filter, echo]
transform = Compose(transforms)
print("Transform:", transform)

transformed_audio = transform(audio)
display(Audio(transformed_audio, rate=sr))
Transform: Compose(
RandomResizedCrop()
HighLowPass()
Delay()
)
Instead of retrieving a single augmented example, let’s return 4 different views of the original sound:
# ComposeMany applies the chain independently several times, returning
# multiple augmented "views" of the same input.
num_augmented_samples = 4  # we want 4 augmented samples from ComposeMany

transform = ComposeMany(transforms, num_augmented_samples=num_augmented_samples)
print("Transform:", transform)

for ta in transform(audio):
    plot_spectrogram(ta, sr, title="")
    display(Audio(ta, rate=sr))
    plt.show()
Transform: ComposeMany(
RandomResizedCrop()
HighLowPass()
Delay()
)
Stochastic Audio Data Augmentations¶
# BUG FIX: RandomApply was used here before being imported — the import
# only appeared in a later cell, so a fresh top-to-bottom run raised
# NameError at this point.
from torchaudio_augmentations import RandomApply

transforms = [
    PolarityInversion(),
    PitchShift(sample_rate=sr, n_samples=audio.shape[1]),
    Reverb(sample_rate=sr),
]

# Apply the whole chain together with probability 0.5.
stochastic_transforms = [
    RandomApply(transforms, p=0.5)
]

transform = Compose(stochastic_transforms)
print(transform)

transformed_audio = transform(audio)
display(Audio(transformed_audio, rate=sr))
Audio chain stochastic augmentations¶
from torchaudio_augmentations import RandomApply

# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4
# 4 seconds of audio
num_samples = sr * 4

stochastic_transforms = [
    RandomResizedCrop(n_samples=num_samples),
    # apply the polarity/filter/delay chain with p = 0.3
    RandomApply(
        [
            PolarityInversion(),
            HighLowPass(
                sample_rate=sr,
                lowpass_freq_low=2200,
                lowpass_freq_high=4000,
                highpass_freq_low=200,
                highpass_freq_high=1200,
            ),
            Delay(
                sample_rate=sr,
                volume_factor=0.5,
                min_delay=100,
                max_delay=500,
                delay_interval=1,
            ),
        ],
        p=0.3,
    ),
    # apply the pitch/gain/noise/reverb chain with p = 0.8
    RandomApply(
        [
            PitchShift(sample_rate=sr, n_samples=num_samples),
            Gain(),
            Noise(max_snr=0.01),
            Reverb(sample_rate=sr),
        ],
        p=0.8,
    ),
]

transform = ComposeMany(stochastic_transforms, num_augmented_samples=num_augmented_samples)
print("Transform:", transform)

# BUG FIX: the transform was defined but never applied — the loop below
# previously replayed stale `transformed_audio` from an earlier cell.
transformed_audio = transform(audio)

for ta in transformed_audio:
    display(Audio(ta, rate=sr))
    plt.show()
Transform: ComposeMany(
RandomResizedCrop()
RandomApply(
p=0.3
PolarityInversion()
HighLowPass()
Delay()
)
RandomApply(
p=0.8
<torchaudio_augmentations.augmentations.pitch_shift.PitchShift object at 0x7fda773b7d60>
Gain()
Noise()
Reverb()
)
)
Single stochastic augmentations¶
# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4
# 4 seconds of audio
num_samples = sr * 4

# define our stochastic augmentations: each transform is applied
# independently with its own probability
transforms = [
    RandomResizedCrop(n_samples=num_samples),
    RandomApply([PolarityInversion()], p=0.8),
    RandomApply([HighLowPass(sample_rate=sr)], p=0.6),
    RandomApply([Delay(sample_rate=sr)], p=0.6),
    RandomApply([PitchShift(sample_rate=sr, n_samples=num_samples)], p=0.3),
    RandomApply([Gain()], p=0.6),
    RandomApply([Noise(max_snr=0.01)], p=0.3),
    RandomApply([Reverb(sample_rate=sr)], p=0.5),
]

transform = ComposeMany(transforms, num_augmented_samples=num_augmented_samples)
print("Transform:", transform)

transformed_audio = transform(audio)
for ta in transformed_audio:
    # BUG FIX: this line was `plot_spectrogram(ta, sr, title=e="")`,
    # which is a syntax error.
    plot_spectrogram(ta, sr, title="")
    display(Audio(ta, rate=sr))
    plt.show()
Transform: ComposeMany(
RandomResizedCrop()
RandomApply(
p=0.8
PolarityInversion()
)
RandomApply(
p=0.6
HighLowPass()
)
RandomApply(
p=0.6
Delay()
)
RandomApply(
p=0.3
<torchaudio_augmentations.augmentations.pitch_shift.PitchShift object at 0x7fda7980f1f0>
)
RandomApply(
p=0.6
Gain()
)
RandomApply(
p=0.3
Noise()
)
RandomApply(
p=0.5
Reverb()
)
)